Text Classification with RCNN (Kaggle competition: Quora Insincere Questions Classification)

Reference 1: "Recurrent Convolutional Neural Networks for Text Classification"

First, let's look directly at the model diagram:

(Figure: the RCNN architecture from the paper, showing the bi-directional recurrent layer, max-pooling layer, and fully connected output layer.)

From the diagram it is clear that the model is a Bi-RNN followed by max-pooling and a fully connected layer.

The formulas that accompany the diagram are as follows.
Suppose a document $D$ consists of words $w_{1}, w_{2}, \ldots, w_{n}$. We use $p(k \mid D, \theta)$ to denote the probability that the document belongs to class $k$, where $\theta$ are the parameters of the network. $\boldsymbol{c}_{l}\left(\boldsymbol{w}_{i}\right)$ and $\boldsymbol{c}_{r}\left(\boldsymbol{w}_{i}\right)$ denote the left and right context of word $w_i$, and $e\left(w_{i-1}\right)$ denotes the word embedding of $w_{i-1}$.
In the end we have these two equations, one for each direction of the Bi-RNN:

$$c_l(w_i) = f\left(W^{(l)} c_l(w_{i-1}) + W^{(sl)} e(w_{i-1})\right)$$

$$c_r(w_i) = f\left(W^{(r)} c_r(w_{i+1}) + W^{(sr)} e(w_{i+1})\right)$$

The two contexts are then concatenated with the word embedding itself:

$$x_i = \left[c_l(w_i); e(w_i); c_r(w_i)\right]$$

followed by a non-linear transformation:

$$y_i^{(2)} = \tanh\left(W^{(2)} x_i + b^{(2)}\right)$$

Element-wise max-pooling is applied over all positions:

$$y^{(3)} = \max_{i=1}^{n} y_i^{(2)}$$

and finally a softmax layer completes the classification:

$$y^{(4)} = W^{(4)} y^{(3)} + b^{(4)}, \qquad p_i = \frac{\exp\left(y_i^{(4)}\right)}{\sum_{k=1}^{K} \exp\left(y_k^{(4)}\right)}$$
This is how the RNN (the recurrent contexts) and the CNN (the convolution-style transformation plus max-pooling) are combined into a single model.
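
To make the equations concrete, here is a minimal NumPy sketch of one forward pass. The dimensions and randomly initialized weights are made up purely for illustration, and the paper's $f$ is taken to be $\tanh$; this is not the trained model, just the shape of the computation:

import numpy as np

n, emb, h, hid, K = 5, 4, 3, 6, 2          # words, embed dim, context dim, y^(2) dim, classes
rng = np.random.default_rng(0)
e = rng.normal(size=(n, emb))              # e(w_i): one embedding per word
W_l, W_sl = rng.normal(size=(h, h)), rng.normal(size=(h, emb))
W_r, W_sr = rng.normal(size=(h, h)), rng.normal(size=(h, emb))
W2, b2 = rng.normal(size=(hid, h + emb + h)), np.zeros(hid)
W4, b4 = rng.normal(size=(K, hid)), np.zeros(K)

# left contexts scan left-to-right, right contexts scan right-to-left
c_l, c_r = np.zeros((n, h)), np.zeros((n, h))
for i in range(1, n):
    c_l[i] = np.tanh(W_l @ c_l[i - 1] + W_sl @ e[i - 1])
for i in range(n - 2, -1, -1):
    c_r[i] = np.tanh(W_r @ c_r[i + 1] + W_sr @ e[i + 1])

x = np.concatenate([c_l, e, c_r], axis=1)  # x_i = [c_l(w_i); e(w_i); c_r(w_i)]
y2 = np.tanh(x @ W2.T + b2)                # y_i^(2)
y3 = y2.max(axis=0)                        # element-wise max-pooling over positions
y4 = W4 @ y3 + b4
p = np.exp(y4) / np.exp(y4).sum()          # softmax over the K classes
print(p)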

One detail in this paper deserves attention. Take the following sentence:
[I like NLP]
[w1 w2 w3]
If we feed this sequence into the left RNN as-is, then w1 goes in first, and by the formula above the output we get is $c_l(w_2)$: the states come out shifted by one position relative to the words. That makes it awkward to assemble [$c_l(w_2)$; $w_2$; $c_r(w_2)$] at the concatenation layer. So we shift the inputs instead: the left-context sequence becomes [UNK w1 w2], and likewise the right-context sequence becomes [w2 w3 UNK].
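
A minimal NumPy sketch of this shifting, using made-up token ids for the three words (0 stands in for the UNK/padding token):

import numpy as np

doc = np.array([[11, 22, 33]])  # [w1, w2, w3] = [I, like, NLP]
left = np.concatenate([np.zeros_like(doc[:, :1]), doc[:, :-1]], axis=1)
right = np.concatenate([doc[:, 1:], np.zeros_like(doc[:, :1])], axis=1)
print(left)   # [[ 0 11 22]] -> [UNK, w1, w2]
print(right)  # [[22 33  0]] -> [w2, w3, UNK]

The handle_context helper in the implementation below does exactly this on whole padded batches.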

Implementation

We reproduce the model from the paper with Keras:

def loadRCNNModel(max_features, embed_size, embedding_matrix=None):
    hidden_size1 = 100   # dimension of each context vector c_l / c_r
    hidden_size2 = 200   # dimension of the latent semantic vector y^(2)
    doc = Input(shape=(max_len,), dtype="int32")
    left_context = Input(shape=(max_len,), dtype="int32")
    right_context = Input(shape=(max_len,), dtype="int32")
    if embedding_matrix is None:
        embedding = Embedding(max_features, embed_size)
    else:
        embedding = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)
    doc_embedding = embedding(doc)
    left_embedding = embedding(left_context)
    right_embedding = embedding(right_context)

    # forward RNN over the left contexts, backward RNN over the right contexts
    forward = LSTM(hidden_size1, return_sequences=True)(left_embedding)
    backward = LSTM(hidden_size1, return_sequences=True, go_backwards=True)(right_embedding)
    # go_backwards returns the sequence reversed, so flip it back to align positions
    backward = Lambda(lambda x: backend.reverse(x, axes=1))(backward)

    # x_i = [c_l(w_i); e(w_i); c_r(w_i)]
    together = concatenate([forward, doc_embedding, backward], axis=2)

    # a kernel_size=1 convolution is exactly the per-position transform y^(2) = tanh(W x_i + b)
    semantic = Conv1D(hidden_size2, kernel_size=1, activation="tanh")(together)
    # element-wise max-pooling over the time axis
    pool_rnn = Lambda(lambda x: backend.max(x, axis=1), output_shape=(hidden_size2,))(semantic)

    # binary classification, so a sigmoid replaces the paper's softmax
    output = Dense(1, activation="sigmoid")(pool_rnn)
    model = Model(inputs=[doc, left_context, right_context], outputs=output)
    return model

This model can be read side by side with the diagram above.
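
As a quick sanity check, the model can be built and inspected like this (a usage sketch; max_len, max_feature, and embed_size are the globals defined in the full script below):

max_len, max_feature, embed_size = 100, 50000, 300

model = loadRCNNModel(max_feature, embed_size)  # random embeddings, no pretrained matrix
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()  # the concatenate layer outputs (None, max_len, 100 + 300 + 100)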

We also need to preprocess the data accordingly (see the shifting trick noted above), which the following helper implements:

def handle_context(doc):
    # shift right by one (prepend padding id 0) for the left contexts,
    # shift left by one (append padding id 0) for the right contexts
    left_context = np.concatenate((np.zeros((len(doc), 1), dtype=np.int32), doc[:, :-1]), axis=1)
    right_context = np.concatenate((doc[:, 1:], np.zeros((len(doc), 1), dtype=np.int32)), axis=1)
    return [doc, left_context, right_context]
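
For example, on a tiny padded batch (a hypothetical check, assuming token id 0 is the padding/UNK index, as with Keras' Tokenizer):

x = np.array([[11, 22, 33, 0]], dtype=np.int32)  # one padded question
doc, left, right = handle_context(x)
# doc:   [[11 22 33  0]]
# left:  [[ 0 11 22 33]]
# right: [[22 33  0  0]]

The returned list lines up with the model's three inputs [doc, left_context, right_context], so it can be fed directly to model.fit or model.predict.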

The complete code is given below.

(The data can be downloaded from the Kaggle QIQC competition.)

import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

print(os.listdir("../input"))

from keras import backend
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Dense, Input, Conv1D
from keras.layers import LSTM, Lambda
from keras import Model
from keras.layers.merge import concatenate
from sklearn.model_selection import train_test_split
from sklearn import metrics

max_len = 100
max_feature = 50000
embed_size = 300


def load_and_prec(val_size=0.1):
    """
    Load the datasets and build the vocabulary.
    """
    df_train = pd.read_csv("../input/train.csv")  # .iloc[0:100, :]
    df_train, df_val = train_test_split(df_train, test_size=val_size, random_state=666)
    df_test = pd.read_csv("../input/test.csv")
    # fill up the missing values
    x_train = df_train["question_text"].fillna("_##_").values
    x_val = df_val["question_text"].fillna("_##_").values
    x_test = df_test["question_text"].fillna("_##_").values

    # tokenize the sequences
    tokenizer = Tokenizer(num_words=max_feature)
    tokenizer.fit_on_texts(list(x_train))
    x_train = tokenizer.texts_to_sequences(x_train)
    x_val = tokenizer.texts_to_sequences(x_val)
    x_test = tokenizer.texts_to_sequences(x_test)

    # pad the sequences
    x_train = pad_sequences(x_train, maxlen=max_len)
    x_val = pad_sequences(x_val, maxlen=max_len)
    x_test = pad_sequences(x_test, maxlen=max_len)

    # get the target values
    y_train = df_train['target'].values
    y_val = df_val['target'].values

    return x_train, y_train, x_val, y_val, x_test, tokenizer.word_index


def find_best_prob(model, x_val, y_val, batch_size=512):
    # search for the decision threshold that maximizes F1 on the validation set
    y_pred = model.predict(x_val, batch_size=batch_size, verbose=1)
    best_f = 0
    best_p = 0
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        f1 = metrics.f1_score(y_val, (y_pred > thresh).astype(int))
        if f1 > best_f:
            best_f = f1
            best_p = thresh
        print("F1 score at threshold {0} is {1}".format(thresh, f1))
    print("best param:", best_p, "F1:", best_f)
    return best_p


def load_glove(word_index, max_feature):
    """
    Load the pretrained word vectors.
    """
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'

    def get_coefs(word, *arr):
        # a single * collects the remaining positional args as a tuple
        # (** would collect keyword args as a dict)
        return word, np.asarray(arr, dtype='float32')

    embedding_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, "r", encoding="utf-8"))

    # mean and standard deviation of the pretrained vectors
    all_embs = np.stack(embedding_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # build the embedding matrix; out-of-vocabulary words get random vectors
    # drawn with the same statistics as the pretrained ones
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_feature, embed_size))
    for word, i in word_index.items():
        if i >= max_feature:
            continue
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def loadRCNNModel(max_features, embed_size, embedding_matrix=None):
    hidden_size1 = 100
    hidden_size2 = 200
    doc = Input(shape=(max_len,), dtype="int32")
    left_context = Input(shape=(max_len,), dtype="int32")
    right_context = Input(shape=(max_len,), dtype="int32")
    if embedding_matrix is None:
        embedding = Embedding(max_features, embed_size)
    else:
        embedding = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)
    doc_embedding = embedding(doc)
    left_embedding = embedding(left_context)
    right_embedding = embedding(right_context)

    forward = LSTM(hidden_size1, return_sequences=True)(left_embedding)
    backward = LSTM(hidden_size1, return_sequences=True, go_backwards=True)(right_embedding)

    backward = Lambda(lambda x: backend.reverse(x, axes=1))(backward)
    together = concatenate([forward, doc_embedding, backward], axis=2)

    semantic = Conv1D(hidden_size2, kernel_size=1, activation="tanh")(together)
    pool_rnn = Lambda(lambda x: backend.max(x, axis=1), output_shape=(hidden_size2,))(semantic)

    output = Dense(1, activation="sigmoid")(pool_rnn)
    model = Model(inputs=[doc, left_context, right_context], outputs=output)
    return model


def handle_context(doc):
    left_context = np.concatenate((np.zeros((len(doc), 1), dtype=np.int32), doc[:, :-1]), axis=1)
    right_context = np.concatenate((doc[:, 1:], np.zeros((len(doc), 1), dtype=np.int32)), axis=1)
    return [doc, left_context, right_context]


if __name__ == '__main__':
    print("load data...")
    x_train, y_train, x_val, y_val, x_test, word_index = load_and_prec()
    # embedding_matrix = load_glove(word_index, max_feature)
    print("train...")
    x_train = handle_context(x_train)
    x_val = handle_context(x_val)
    model = loadRCNNModel(max_feature, embed_size)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    model.fit(x_train, y_train,
              validation_data=(x_val, y_val), batch_size=512, epochs=4,
              verbose=2)

    print("find best prob...")
    best_prob = find_best_prob(model, x_val, y_val, batch_size=512)
    x_test = handle_context(x_test)
    print("predict...")
    y_pred = model.predict(x_test)
    sub = pd.read_csv("../input/sample_submission.csv")
    sub["prediction"] = (y_pred.flatten() > best_prob).astype(int)
    sub.to_csv("submission.csv", index=False)
    print("end...")